from sklearn.linear_model import LogisticRegression
import pandas as pd
from utils import get_dataset
from models.LRBinsModel import LRBinsModel
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm
import pickle
import argparse

# Command-line interface: one required positional argument naming the dataset.
parser = argparse.ArgumentParser()
parser.add_argument(
    "datasetname",
    type=str,
    help="the name of the dataset to process",
)
args = parser.parse_args()
datasetname = args.datasetname

# Both inputs are located by dataset name, relative to the working directory.
csv_path = f"data/{datasetname}.csv"
hyperparameters_path = f"hyperparameters/{datasetname}.p"

data = pd.read_csv(csv_path)

# NOTE: pickle.load executes arbitrary code embedded in the file; only run
# this script against hyperparameter files from a trusted source.
with open(hyperparameters_path, "rb") as handle:
    hyperparameters = pickle.load(handle)

# Per-run score accumulators, grouped by model as (accuracy, ROC AUC).
# The evaluation loop appends one value to each list per seed.
paperlracc, paperlrrocauc = [], []  # plain logistic regression
paperlrwbinsacc, paperlrwbinsrocauc = [], []  # LRBinsModel
paperxgbacc, paperxgbrocauc = [], []  # XGBoost

def _score_classifier(clf, X_test, y_test):
    """Score a fitted binary classifier on the held-out split.

    Args:
        clf: Fitted estimator exposing ``predict_proba`` and ``predict``.
        X_test: Test features passed to both prediction methods.
        y_test: Test labels, already cast to ``int`` by the caller.

    Returns:
        A ``(roc_auc, accuracy)`` tuple computed with
        ``roc_auc_score`` and ``accuracy_score``.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    y_probs = clf.predict_proba(X_test)[:, 1]
    y_preds = clf.predict(X_test).astype(int)
    return roc_auc_score(y_test, y_probs), accuracy_score(y_test, y_preds)


# Number of repetitions; the loop index doubles as the random seed of each run.
num_runs = 20
for seed in tqdm(range(num_runs)):
    # get_dataset is called with random_state=seed, presumably producing a
    # different split per run (confirm in utils). The validation split and
    # feature names are returned but not used by this script.
    X_train, X_val, X_test, y_train, y_val, y_test, feature_names = get_dataset(
        data, normalize=True, random_state=seed
    )

    # Cast the test labels once, before any metric is computed, so that all
    # three evaluations below receive the same integer labels.
    y_test = y_test.astype(int)

    # xgboost model
    clf = XGBClassifier(
        max_depth=hyperparameters["xgb_max_depth"],
        n_estimators=hyperparameters["xgb_n_estimators"],
    )
    clf.fit(X_train, y_train)
    roc_score, acc_score = _score_classifier(clf, X_test, y_test)
    paperxgbrocauc.append(roc_score)
    paperxgbacc.append(acc_score)

    # lrbins model
    model = LRBinsModel(
        n_bin_features=hyperparameters["lrbins_n_bin_features"],
        n_inference_features=hyperparameters["lrbins_n_inference_features"],
    )
    model.fit(X_train, y_train)
    # performance() returns a mapping with "rocauc" and "accuracy" entries.
    results = model.performance(X_test, y_test)
    paperlrwbinsrocauc.append(results["rocauc"])
    paperlrwbinsacc.append(results["accuracy"])

    # lr model
    # The baseline is trained on the features picked by the fitted LRBinsModel
    # (via its feature_importances and n_inference_features), so it sees the
    # same inputs as the LRBins inference step.
    important_features_X_train = model.get_important_features(
        X_train, model.feature_importances, model.n_inference_features
    )
    important_features_X_test = model.get_important_features(
        X_test, model.feature_importances, model.n_inference_features
    )
    lr_clf = LogisticRegression()
    lr_clf.fit(important_features_X_train, y_train)
    lr_roc_score, lr_acc_score = _score_classifier(
        lr_clf, important_features_X_test, y_test
    )
    paperlrrocauc.append(lr_roc_score)
    paperlracc.append(lr_acc_score)

# Final report: dataset name, then mean +/- standard deviation over all runs
# for each (model, metric) pair, ROC AUC rows first and accuracy rows second.
print(f"datasetname:{datasetname}")
summary_rows = (
    ("lr rocauc", paperlrrocauc),
    ("lrwbins rocauc", paperlrwbinsrocauc),
    ("xgb rocauc", paperxgbrocauc),
    ("lr acc", paperlracc),
    ("lrwbins acc", paperlrwbinsacc),
    ("xgb acc", paperxgbacc),
)
for label, scores in summary_rows:
    print(f"{label}:{np.mean(scores):.3f} +/- {np.std(scores):.3f}")